/*
Obtain the gendered amenity classifications and add them to the analysis samples (with industry and state fixed effects)
	// input: amenities_gender_sample, clauselabels
	// output: amenity_value_gender_ufind2, amenities_classification_DD_unfind2
*/

cap log close
cap log using "$logs/amenity_value_coeffs_ufind2_log", replace


*************
**LOAD DATA**
*************

* Sample for data-driven classification of clauses
use "$files\amenities_gender_sample", clear

* PageRank normalizations
gen fe_diff = abs(fe_m-fe_f)
gen pagerank_diff = abs(pagerank_m-pagerank_f)
qui sum fe_diff, d
list estabid fe_m fe_f fe_diff pagerank_m pagerank_f pagerank_diff if fe_diff==r(min)
qui sum pagerank_diff if fe_diff==r(min)
list estabid fe_m fe_f fe_diff pagerank_m pagerank_f pagerank_diff if pagerank_diff==r(min)
gen Diff = pagerank_diff if pagerank_diff==r(min)
gen X = pagerank_m if pagerank_diff==r(min)
gen Ratio = pagerank_m/pagerank_f if pagerank_diff==r(min)
gen B = Diff/(X*(Ratio-1))
gen A = (Diff/X)+B
gen pagerank_m_norm = pagerank_m
sum Ratio
gen pagerank_f_norm = pagerank_f*r(mean)

* PageRanks: logs of exp(V)
gen ln_pagerank_m = log(pagerank_m)
gen ln_pagerank_f = log(pagerank_f)
gen ln_pagerank_m_norm = log(pagerank_m_norm)
gen ln_pagerank_f_norm = log(pagerank_f_norm)

* Inverse hyporbolic sine of clauses
foreach x of varlist *cl_* {
	replace `x' = asinh(`x')
}

* Restrictions
//keep estab with at least 4 sectoral contracts over time frame considered
egen tag_estabid = tag(estabid)
sum cnt, d
tab cnt if tag_estabid==1
keep if cnt>=4
//random sample split
splitsample, generate(sample) nsplit(2) rseed(1234)
sort estabid

* Some descriptives
// gap in page ranks (with and without normalization)
gen gap_pagerank = ln_pagerank_f - ln_pagerank_m
gen gap_pagerank_norm = ln_pagerank_f_norm - ln_pagerank_m_norm
// index ranking of PageRanks (minimum at 0; maximum at 100)
qui sum pagerank_m
gen pagerank_m_index = (pagerank_m-r(min))*(100/(r(max)-r(min)))
qui sum pagerank_f
gen pagerank_f_index = (pagerank_f-r(min))*(100/(r(max)-r(min)))
gen gap_pagerank_index = pagerank_f_index - pagerank_m_index
//correlation of gaps
corr gap_pagerank*

* Clause labeling
//label clauses 
do "$raw/clauselabels.do" 	
//collinearity in clauses
sum cl_43pro_pro cl_51aut_tra	
drop cl_43pro_pro cl_51aut_tra	

* Geography and industry
gen uf = floor(municipality/1e4)
gen region = floor(municipality/1e5)
gen ind1 = floor(cnae20subcl/1e6)
gen ind2 = floor(cnae20subcl/1e5)
gen ind3 = floor(cnae20subcl/1e4)
	
	
**********************************************
*** select clauses with data driven method ***
**********************************************

** LASSO ON VALUE DIFF (method1)**
	
	* Adaptive lasso
	lasso linear gap_pagerank (i.uf i.ind2) fe_f fe_m cl_* if sample==1, selection(adaptive) rseed(1234)
	estimates store method1
	matrix b_method1 = e(b_standardized)'
	lassocoef, display(coef, standardized) sort(coef, standardized)
	local selected_vars_m1 "`e(allvars_sel)'"

	* Evaluate
	lassocoef method1, sort(coef, standardized) nofvlabel
	lassogof method1, over(sample) postselection
	qui sum gap_pagerank if sample==1
	di "Variance in sample==1: `r(Var)'"
	qui sum gap_pagerank if sample==2
	di "Variance in sample==2: `r(Var)'"
	
	* Post-selection OLS (with geography and industry fixed effects)
	reghdfe gap_pagerank `selected_vars_m1' if sample==1, absorb(uf ind2)
	estimates store method1_ols 
	matrix b_method1_ols = e(b)'	

	* Export
	putexcel set "$tables\amenity_value_gender_ufind2.xlsx", sheet(method1) replace
	putexcel A1=matrix(b_method1), names nformat(number_d2)
	putexcel A1 = "Clause"
	putexcel B1 = "Coef (standardized)"	
	
	putexcel D1=matrix(b_method1_ols), names nformat(number_d2)
	putexcel D1 = "Clause"
	putexcel E1 = "Coef (OLS)"	
	
	* Sanity check: see if something changes by flipping difference (it doesn't, as it should)
	gen flip_gap = - gap_pagerank
	lasso linear flip_gap (i.uf i.ind2) fe_f fe_m cl_* if sample==1, selection(adaptive) rseed(1234)
	lassocoef, display(coef, standardized) sort(coef, standardized)
	drop flip_gap
	
** LASSO ON VALUE DIFF, ENTIRE SAMPLE --> just to check the difference **
	
	* Adaptive lasso
	lasso linear gap_pagerank (i.uf i.ind2) fe_f fe_m cl_*, selection(adaptive) rseed(1234)
	estimates store method1_v2
	matrix b_method1 = e(b_standardized)'
	lassocoef, display(coef, standardized) sort(coef, standardized)
	local selected_vars_m1 "`e(allvars_sel)'"

	* Export
	putexcel set "$tables\amenity_value_gender_ufind2.xlsx", sheet(method1_v2) modify
	putexcel A1=matrix(b_method1), names nformat(number_d2)
	putexcel A1 = "Clause"
	putexcel B1 = "Coef (standardized)"	
	
** LASSO ON NORMALIZED VALUE DIFF (method2)**
	
	* Adaptive lasso
	lasso linear gap_pagerank_norm (i.uf i.ind2) fe_f fe_m cl_* if sample==1, selection(adaptive) rseed(1234)
	estimates store method2
	matrix b_method2 = e(b_standardized)'
	lassocoef, display(coef, standardized) sort(coef, standardized)
	local selected_vars_m2 "`e(allvars_sel)'"

	* Evaluate
	lassocoef method2, sort(coef, standardized) nofvlabel
	lassogof method2, over(sample) postselection
	qui sum gap_pagerank_norm if sample==1
	di "Variance in sample==1: `r(Var)'"
	qui sum gap_pagerank_norm if sample==2
	di "Variance in sample==2: `r(Var)'"
	
	* Post-selection OLS (with geography and industry fixed effects)
	reghdfe gap_pagerank_norm `selected_vars_m2' if sample==1, absorb(uf ind2)
	estimates store method2_ols 
	matrix b_method2_ols = e(b)'	

	* Export
	putexcel set "$tables\amenity_value_gender_ufind2.xlsx", sheet(method2) modify
	putexcel A1=matrix(b_method2), names nformat(number_d2)
	putexcel A1 = "Clause"
	putexcel B1 = "Coef (standardized)"	
	
	putexcel D1=matrix(b_method2_ols), names nformat(number_d2)
	putexcel D1 = "Clause"
	putexcel E1 = "Coef (OLS)"	
	
	* Sanity check: see if something changes by flipping difference (it doesn't, as it should)
	gen flip_gap = - gap_pagerank_norm
	lasso linear flip_gap (i.uf i.ind2) fe_f fe_m cl_* if sample==1, selection(adaptive) rseed(1234)
	lassocoef, display(coef, standardized) sort(coef, standardized)
	drop flip_gap
	
** LASSO ON NORMALIZED VALUE DIFF, ENTIRE SAMPLE --> just to check the difference **
	
	* Adaptive lasso
	lasso linear gap_pagerank_norm (i.uf i.ind2) fe_f fe_m cl_*, selection(adaptive) rseed(1234)
	estimates store method2_v2
	matrix b_method2 = e(b_standardized)'
	lassocoef, display(coef, standardized) sort(coef, standardized)
	local selected_vars_m2 "`e(allvars_sel)'"

	* Export
	putexcel set "$tables\amenity_value_gender_ufind2.xlsx", sheet(method2_v2) modify
	putexcel A1=matrix(b_method2), names nformat(number_d2)
	putexcel A1 = "Clause"
	putexcel B1 = "Coef (standardized)"	
	
** LASSO ON INDEX DIFF (method3)**
	
	* Adaptive lasso
	lasso linear gap_pagerank_index (i.uf i.ind2) fe_f fe_m cl_* if sample==1, selection(adaptive) rseed(1234)
	estimates store method3
	matrix b_method3 = e(b_standardized)'
	lassocoef, display(coef, standardized) sort(coef, standardized)
	local selected_vars_m3 "`e(allvars_sel)'"

	* Evaluate
	lassocoef method3, sort(coef, standardized) nofvlabel
	lassogof method3, over(sample) postselection
	qui sum gap_pagerank_index if sample==1
	di "Variance in sample==1: `r(Var)'"
	qui sum gap_pagerank_index if sample==2
	di "Variance in sample==2: `r(Var)'"
	
	* Post-selection OLS (with geography and industry fixed effects)
	reghdfe gap_pagerank_index `selected_vars_m3' if sample==1, absorb(uf ind2)
	estimates store method3_ols 
	matrix b_method3_ols = e(b)'	

	* Export
	putexcel set "$tables\amenity_value_gender_ufind2.xlsx", sheet(method3) modify
	putexcel A1=matrix(b_method3), names nformat(number_d2)
	putexcel A1 = "Clause"
	putexcel B1 = "Coef (standardized)"	
	
	putexcel D1=matrix(b_method3_ols), names nformat(number_d2)
	putexcel D1 = "Clause"
	putexcel E1 = "Coef (OLS)"	
	
	* Sanity check: see if something changes by flipping difference (it doesn't, as it should)
	gen flip_gap = - gap_pagerank_index
	lasso linear flip_gap (i.uf i.ind2) fe_f fe_m cl_* if sample==1, selection(adaptive) rseed(1234)
	lassocoef, display(coef, standardized) sort(coef, standardized)
	drop flip_gap
	
** LASSO ON INDEX DIFF, ENTIRE SAMPLE --> just to check the difference **
	
	* Adaptive lasso
	lasso linear gap_pagerank_index (i.uf i.ind2) fe_f fe_m cl_*, selection(adaptive) rseed(1234)
	estimates store method3_v2
	matrix b_method3 = e(b_standardized)'
	lassocoef, display(coef, standardized) sort(coef, standardized)
	local selected_vars_m3 "`e(allvars_sel)'"

	* Export
	putexcel set "$tables\amenity_value_gender_ufind2.xlsx", sheet(method3_v2) modify
	putexcel A1=matrix(b_method3), names nformat(number_d2)
	putexcel A1 = "Clause"
	putexcel B1 = "Coef (standardized)"	
	
	
********************************************
*** save clauses with data driven method ***
********************************************

* Method 1: values
import excel using "$tables\amenity_value_gender_ufind2.xlsx", sheet(method1) firstrow clear
keep  Clause Coef*
rename (Clause Coefstandardized CoefOLS) (clause coef coefols)
drop if inlist( clause, "fe_f", "fe_m", "_cons")
drop if strpos(clause, ".")>0
//correlation between LASSO ranking and OLS with FEs (all LASSO selected clauses)//
pwcorr coef coefols
spearman coef coefols
gsort - coef 
gen aux = _n 
gen isfemale_amenity_DD_m1 = (aux<=20)
gsort + coef 
drop aux
gen aux = _n 
gen ismale_amenity_DD_m1 = (aux<=20)
drop if (isfemale_amenity_DD_m1==0)&(ismale_amenity_DD_m1==0)
//correlation between LASSO ranking and OLS with FEs (female and male DD amenities)//
pwcorr coef coefols
spearman coef coefols
keep clause is*
duplicates drop
save "$files/amenities_classification_DD_ufind2", replace
//method1_v2
import excel using "$tables\amenity_value_gender_ufind2.xlsx", sheet(method1_v2) firstrow clear
keep  Clause Coef*
rename (Clause Coefstandardized) (clause coef)
drop if inlist( clause, "fe_f", "fe_m", "_cons")
drop if strpos(clause, ".")>0
gsort - coef 
gen aux = _n 
gen isfemale_amenity_DD_m1v2 = (aux<=20)
gsort + coef 
drop aux
gen aux = _n 
gen ismale_amenity_DD_m1v2 = (aux<=20)
drop if (isfemale_amenity_DD_m1v2==0)&(ismale_amenity_DD_m1v2==0)
keep clause is*
duplicates drop
merge 1:1 clause using "$files/amenities_classification_DD_ufind2"
drop _merge
save "$files/amenities_classification_DD_ufind2", replace

* Method 2: normalized values
import excel using "$tables\amenity_value_gender_ufind2.xlsx", sheet(method2) firstrow clear
keep  Clause Coef*
rename (Clause Coefstandardized CoefOLS) (clause coef coefols)
drop if inlist( clause, "fe_f", "fe_m", "_cons")
drop if strpos(clause, ".")>0
//correlation between LASSO ranking and OLS with FEs (all LASSO selected clauses)//
pwcorr coef coefols
spearman coef coefols
gsort - coef 
gen aux = _n 
gen isfemale_amenity_DD_m2 = (aux<=20)
gsort + coef 
drop aux
gen aux = _n 
gen ismale_amenity_DD_m2 = (aux<=20)
drop if (isfemale_amenity_DD_m2==0)&(ismale_amenity_DD_m2==0)
//correlation between LASSO ranking and OLS with FEs (female and male DD amenities)//
pwcorr coef coefols
spearman coef coefols
keep clause is*
duplicates drop
merge 1:1 clause using "$files/amenities_classification_DD_ufind2"
drop _merge
save "$files/amenities_classification_DD_ufind2", replace
//method2_v2
import excel using "$tables\amenity_value_gender_ufind2.xlsx", sheet(method2_v2) firstrow clear
keep  Clause Coef*
rename (Clause Coefstandardized) (clause coef)
drop if inlist( clause, "fe_f", "fe_m", "_cons")
drop if strpos(clause, ".")>0
gsort - coef 
gen aux = _n 
gen isfemale_amenity_DD_m2v2 = (aux<=20)
gsort + coef 
drop aux
gen aux = _n 
gen ismale_amenity_DD_m2v2 = (aux<=20)
drop if (isfemale_amenity_DD_m2v2==0)&(ismale_amenity_DD_m2v2==0)
keep clause is*
duplicates drop
merge 1:1 clause using "$files/amenities_classification_DD_ufind2"
drop _merge
save "$files/amenities_classification_DD_ufind2", replace

* Method 3: index ranking
import excel using "$tables\amenity_value_gender_ufind2.xlsx", sheet(method3) firstrow clear
keep  Clause Coef*
rename (Clause Coefstandardized CoefOLS) (clause coef coefols)
drop if inlist( clause, "fe_f", "fe_m", "_cons")
drop if strpos(clause, ".")>0
//correlation between LASSO ranking and OLS with FEs (all LASSO selected clauses)//
pwcorr coef coefols
spearman coef coefols
gsort - coef 
gen aux = _n 
gen isfemale_amenity_DD_m3 = (aux<=20)
gsort + coef 
drop aux
gen aux = _n 
gen ismale_amenity_DD_m3 = (aux<=20)
drop if (isfemale_amenity_DD_m3==0)&(ismale_amenity_DD_m3==0)
//correlation between LASSO ranking and OLS with FEs (female and male DD amenities)//
pwcorr coef coefols
spearman coef coefols
keep clause is*
duplicates drop
merge 1:1 clause using "$files/amenities_classification_DD_ufind2"
drop _merge
save "$files/amenities_classification_DD_ufind2", replace
//method3_v2
import excel using "$tables\amenity_value_gender_ufind2.xlsx", sheet(method3_v2) firstrow clear
keep  Clause Coef*
rename (Clause Coefstandardized) (clause coef)
drop if inlist( clause, "fe_f", "fe_m", "_cons")
drop if strpos(clause, ".")>0
gsort - coef 
gen aux = _n 
gen isfemale_amenity_DD_m3v2 = (aux<=20)
gsort + coef 
drop aux
gen aux = _n 
gen ismale_amenity_DD_m3v2 = (aux<=20)
drop if (isfemale_amenity_DD_m3v2==0)&(ismale_amenity_DD_m3v2==0)
keep clause is*
duplicates drop
merge 1:1 clause using "$files/amenities_classification_DD_ufind2"
drop _merge
save "$files/amenities_classification_DD_ufind2", replace

* Aggregate
foreach xxx of varlist is* {
	replace `xxx' = 0 if `xxx'==.
}
egen cnt_female = rowtotal(isfemale*) 
tab cnt_female if cnt_female>0
egen cnt_male = rowtotal(ismale*) 
tab cnt_male if cnt_male>0
save "$files/amenities_classification_DD_ufind2", replace

log close
